# coding=utf-8
# __author__ = 'chenbinghui'
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import urllib
import urllib2
import ToolUtil

class TBMM_Crawler:
    """Regex-based scraper for the mm.taobao.com model listing pages.

    Every page is downloaded with ``urllib2`` and decoded as GBK before the
    regular expressions below are applied to it.
    """

    # One listing entry: captures (profile href, name, <strong> text,
    # <span> text) in that order.
    _LIST_ITEM_RE = re.compile(
        r'<div class="list-item.*?<div class="pic-word.*?'
        r'<a class="lady-name" href="(.*?)".*?>(.*?)</a>.*?<strong>'
        r'(.*?)</strong>.*?<span>(.*?)</span>', re.S)

    # First link inside the "mm-p-menu" list of a profile page.
    _ALBUM_LINK_RE = re.compile(
        r'<ul class="mm-p-menu.*?<li >.*?<span.*?<a href="(.*?)" >', re.S)

    # One album cell: captures (album href, title, picture-number text,
    # date text) in that order.
    # NOTE(review): every match must start at a '<div class="mm-photo-list'
    # opener, so a page with a single such div presumably yields only one
    # album -- confirm against the real markup before changing the pattern.
    _ALBUM_CELL_RE = re.compile(
        r'<div class="mm-photo-list.*?<div class="mm-photo-cell.*?<h4.*?'
        r'<a href="(.*?)" target="_blank">(.*?)</a>.*?'
        r'<span class="mm-pic-number">(.*?)</span>.*?'
        r'<p class="mm-photo-date">(.*?)</p>', re.S)

    def __init__(self):
        """Set the listing endpoint and create the ToolUtil helper."""
        self.siteUrl = 'http://mm.taobao.com/json/request_top_list.htm'
        # Not used by any method of this class; kept so existing callers
        # that read ``self.tool`` keep working.
        self.tool = ToolUtil.ToolUtil()

    def _fetch(self, target):
        """Download *target* and return its body decoded as GBK.

        *target* is anything ``urllib2.urlopen`` accepts (a URL string or a
        ``urllib2.Request``).  The response is always closed, even when
        reading or decoding fails, so the connection is not leaked.
        """
        response = urllib2.urlopen(target)
        try:
            return response.read().decode('gbk')
        finally:
            response.close()

    def getPage(self, pageIndex):
        """Return the decoded HTML of listing page number *pageIndex*."""
        url = self.siteUrl + '?page=' + str(pageIndex)
        return self._fetch(urllib2.Request(url))

    def getContent(self, pageIndex):
        """Return the entries found on listing page *pageIndex*.

        Each entry is a four-element list holding the captured groups of
        ``_LIST_ITEM_RE``: [profile href, name, <strong> text, <span> text].
        The result is an empty list when nothing on the page matches.
        """
        page = self.getPage(pageIndex)
        return [list(groups) for groups in self._LIST_ITEM_RE.findall(page)]

    def getDetailPage(self, infoUrl):
        """Return the album-menu link found on a profile page.

        *infoUrl* is prefixed with ``'https:'`` before it is fetched, so it
        is expected to be a scheme-less (``//host/path``) URL.

        Raises:
            IndexError: if the page contains no matching menu link.  The
                exception type is the one the previous implementation
                raised; it now carries a message naming the page.
        """
        page = self._fetch('https:' + infoUrl)
        # Only the first match is needed, so stop scanning as soon as it is
        # found instead of collecting every match on the page.
        match = self._ALBUM_LINK_RE.search(page)
        if match is None:
            raise IndexError(
                'no album link found on detail page: ' + infoUrl)
        return match.group(1)

    def getPhotoList(self, photoUrl):
        """Print one line per album cell matched on the album page.

        *photoUrl* is prefixed with ``'https:'`` before it is fetched.  Each
        printed line holds the album href, title, picture-number text and
        date text separated by single spaces.  Returns None.
        """
        page = self._fetch('https:' + photoUrl)
        for album in self._ALBUM_CELL_RE.findall(page):
            # Single-argument call form: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print(' '.join(album))







# Demo run, executed whenever this module is loaded: read listing page 1,
# follow the profile link of its first entry to find the album page, then
# print the album cells matched on that page.
tbmm = TBMM_Crawler()
contents = tbmm.getContent(1)
# Field 0 of an entry is the href captured from the listing markup.
firstProfileUrl = contents[0][0]
photoUrl = tbmm.getDetailPage(firstProfileUrl)
tbmm.getPhotoList(photoUrl)
